clear all
set more off, perm
set mem 10000000
set matsize 10000
version 13 

****************************************************************** 
*** Construct full dataset (Census, RGGVY, lights, habitations) **
****************************************************************** 

** Set file paths
do "$path_code/paths.do"

********************************************************************************
********************************************************************************

** Step 1: Prep nighttime light data, creating treatment indicators
{
use "$nl/nightlights_outcomes_stable.dta", clear
keep c_code01 lights_max*
foreach v of varlist lights_max* {
  local v2 = subinstr("`v'","lights_","lights_sta_",1)
	rename `v' `v2'
}

merge 1:1 c_code using "$nl/nightlights_outcomes_x_pct.dta", nogen 
keep c_code01 lights_sta_max* lights_max*
foreach v of varlist lights_max* {
  local v2 = subinstr("`v'","lights_","lights_pct_",1)
	rename `v' `v2'
}

merge 1:1 c_code using "$nl/nightlights_outcomes_avg_vis.dta", nogen
merge 1:m c_code01 using "$nl/attrtables_all.dta", keep(1 3) nogen keepusing(tot_pop f_area)
duplicates drop c_code01, force
merge 1:1 c_code01 using "$path/data/village gis maps/attrtables_lat_lon.dta", keep(1 3) nogen
rename bk_code bk_code_NL
replace state = upper(state)
replace state = "JAMMU & KASHMIR" if substr(state,1,5)=="JAMMU"

*Drop outcomes we won't be using, to save memory
drop *majority* *maj *min *median lights_min???? lights_majority???? lights_median???? biggestdiff*
drop diff*

*Define post and differenced treatment variables
foreach m of newlist mean max {
	gen post_avg_`m'_11 		= lights_`m'2011
	gen post_avg_`m'_10_11 = (lights_`m'2010 + lights_`m'2011)/2 
	gen post_avg_`m'_11_12 = (lights_`m'2011 + lights_`m'2012)/2
	gen post_avg_`m'_10_12 = (lights_`m'2010 + lights_`m'2011 + lights_`m'2012)/3 
	gen post_avg_`m'_09_12 = (lights_`m'2009 + lights_`m'2010 + lights_`m'2011 + lights_`m'2012)/4
	gen post_avg_`m'_10_13 = (lights_`m'2010 + lights_`m'2011 + lights_`m'2012 + lights_`m'2013)/4
	gen post_avg_`m'_09_13 = (lights_`m'2009 + lights_`m'2010 + lights_`m'2011 + lights_`m'2012 + lights_`m'2013)/5

	gen pre_avg_`m'_01 		= lights_`m'2001
	gen pre_avg_`m'_00_01 = (lights_`m'2000 + lights_`m'2001)/2 
	gen pre_avg_`m'_01_02 = (lights_`m'2001 + lights_`m'2002)/2
	gen pre_avg_`m'_00_02 = (lights_`m'2000 + lights_`m'2001 + lights_`m'2002)/3 
	gen pre_avg_`m'_99_02 = (lights_`m'1999 + lights_`m'2000 + lights_`m'2001 + lights_`m'2002)/4
	gen pre_avg_`m'_00_03 = (lights_`m'2000 + lights_`m'2001 + lights_`m'2002 + lights_`m'2003)/4
	gen pre_avg_`m'_99_03 = (lights_`m'1999 + lights_`m'2000 + lights_`m'2001 + lights_`m'2002 + lights_`m'2003)/5

	gen diff_avg_`m'_11 	 = post_avg_`m'_11    - pre_avg_`m'_01
	gen diff_avg_`m'_10_11 = post_avg_`m'_10_11 - pre_avg_`m'_00_01
	gen diff_avg_`m'_11_12 = post_avg_`m'_11_12 - pre_avg_`m'_01_02 
	gen diff_avg_`m'_10_12 = post_avg_`m'_10_12 - pre_avg_`m'_00_02
	gen diff_avg_`m'_09_12 = post_avg_`m'_09_12 - pre_avg_`m'_99_02
	gen diff_avg_`m'_10_13 = post_avg_`m'_10_13 - pre_avg_`m'_00_03
	gen diff_avg_`m'_09_13 = post_avg_`m'_09_13 - pre_avg_`m'_99_03
}

*Define logged treatment variables
foreach m of newlist mean max {
  gen ln_diff_avg_`m'_11     = ln(post_avg_`m'_11) - ln(pre_avg_`m'_01)
  gen ln_diff_avg_`m'_10_12  = ln(post_avg_`m'_10_12) - ln(pre_avg_`m'_00_02)
  gen ln_diff_avg_`m'_09_13  = ln(post_avg_`m'_09_13) - ln(pre_avg_`m'_99_03)
}

*Define projected treatment variables
foreach m of newlist mean max {
	reg lights_`m'2011 lights_`m'2013 lights_`m'2012 lights_`m'2010 lights_`m'2009 [aw=f_area]
	predict lights_`m'2011_hat if e(sample), xb
	reg lights_`m'2001 lights_`m'1999 lights_`m'2000 lights_`m'2002 lights_`m'2003 [aw=f_area]
	predict lights_`m'2001_hat if e(sample), xb
	gen diff_`m'_proj_01_11 = lights_`m'2011_hat - lights_`m'2001_hat
	gen ln_diff_`m'_proj_01_11 = ln(lights_`m'2011_hat) - ln(lights_`m'2001_hat)
}

foreach m of newlist max {
	reg lights_`m'2010 lights_`m'2012 lights_`m'2011 lights_`m'2009 lights_`m'2008 [aw=f_area]
	predict lights_`m'2010_hat if e(sample), xb
	reg lights_`m'2009 lights_`m'2011 lights_`m'2010 lights_`m'2008 lights_`m'2007 [aw=f_area]
	predict lights_`m'2009_hat if e(sample), xb
	reg lights_`m'2008 lights_`m'2010 lights_`m'2009 lights_`m'2007 lights_`m'2006 [aw=f_area]
	predict lights_`m'2008_hat if e(sample), xb
	reg lights_`m'2007 lights_`m'2009 lights_`m'2008 lights_`m'2006 lights_`m'2005 [aw=f_area]
	predict lights_`m'2007_hat if e(sample), xb
	reg lights_`m'2006 lights_`m'2008 lights_`m'2007 lights_`m'2005 lights_`m'2004 [aw=f_area]
	predict lights_`m'2006_hat if e(sample), xb
	reg lights_`m'2005 lights_`m'2007 lights_`m'2006 lights_`m'2004 lights_`m'2003 [aw=f_area]
	predict lights_`m'2005_hat if e(sample), xb
	reg lights_`m'2004 lights_`m'2006 lights_`m'2005 lights_`m'2003 lights_`m'2002 [aw=f_area]
	predict lights_`m'2004_hat if e(sample), xb
	reg lights_`m'2003 lights_`m'2005 lights_`m'2004 lights_`m'2002 lights_`m'2001 [aw=f_area]
	predict lights_`m'2003_hat if e(sample), xb
	reg lights_`m'2002 lights_`m'2004 lights_`m'2003 lights_`m'2001 lights_`m'2000 [aw=f_area]
	predict lights_`m'2002_hat if e(sample), xb
	reg lights_`m'2000 lights_`m'2002 lights_`m'2001 lights_`m'1999 lights_`m'1998 [aw=f_area]
	predict lights_`m'2000_hat if e(sample), xb
	reg lights_`m'1999 lights_`m'2001 lights_`m'2000 lights_`m'1998 [aw=f_area]
	predict lights_`m'1999_hat if e(sample), xb
	reg lights_`m'1998 lights_`m'2000 lights_`m'1999 [aw=f_area]
	predict lights_`m'1998_hat if e(sample), xb
	reg lights_`m'2012 lights_`m'2013 lights_`m'2011 lights_`m'2010 [aw=f_area]
	predict lights_`m'2012_hat if e(sample), xb
	reg lights_`m'2011 lights_`m'2012 lights_`m'2010 [aw=f_area]
	predict lights_`m'2011_1_hat if e(sample), xb
	reg lights_`m'2001 lights_`m'2002 lights_`m'2000 [aw=f_area]
	predict lights_`m'2001_1_hat if e(sample), xb
	
	reg lights_sta_`m'2011 lights_sta_`m'2013 lights_sta_`m'2012 lights_sta_`m'2010 lights_sta_`m'2009 [aw=f_area]
	predict lights_sta_`m'2011_hat if e(sample), xb
	reg lights_sta_`m'2001 lights_sta_`m'1999 lights_sta_`m'2000 lights_sta_`m'2002 lights_sta_`m'2003 [aw=f_area]
	predict lights_sta_`m'2001_hat if e(sample), xb

	reg lights_pct_`m'2011 /*lights_pct_`m'2013*/ lights_pct_`m'2012 lights_pct_`m'2010 lights_pct_`m'2009 [aw=f_area]
	predict lights_pct_`m'2011_hat if e(sample), xb
	reg lights_pct_`m'2001 lights_pct_`m'1999 lights_pct_`m'2000 lights_pct_`m'2002 lights_pct_`m'2003 [aw=f_area]
	predict lights_pct_`m'2001_hat if e(sample), xb
}

}
	
********************************************************************************
********************************************************************************

** Step 2: Merge in Census panel dataset and Habitation merge results, 
{
merge 1:m st_code dt_code vi_code using "$panel/census_panel_2001_2011.dta", gen(merge_NL) keep(2 3)

merge 1:1 pca01_id using "$panel/pca_2001_names_hab_merge_final.dta", nogen keep(1 3)

*Define population mismatch variable between habitation populations and official census populations
gen pop_mismatch20 = 1 if h3v_id!=. | h9v_id!=.
replace pop_mismatch20 = 0 if abs(totp_h3-tot_p)/tot_p<=0.20
replace pop_mismatch20 = 0 if abs(totp_h9-tot_p)/tot_p<=0.20
replace pop_mismatch20 = 0 if abs(totp_h3-tot_p11)/tot_p<=0.20
replace pop_mismatch20 = 0 if abs(totp_h9-tot_p11)/tot_p<=0.20

*Define pooled labor variables
foreach y in "01" "11" {
  foreach p in p m f {
    gen work_main_ag_`p'_`y' = work_main_al_`p'_`y' + work_main_cl_`p'_`y'
	  gen work_marg_ag_`p'_`y' = work_marg_al_`p'_`y' + work_marg_cl_`p'_`y'
  }
}
foreach y in "01" "11" {
  foreach p in p m f {
    foreach w in cl al hh ot ag {
		  gen work_pooled_`w'_`p'_`y' = work_main_`w'_`p'_`y' + work_marg_`w'_`p'_`y'
			*gen work_pooled_workers_`w'_`p'_`y' = work_pooled_`w'_`p'_`y'/work_`p'_`y'
	  }
  }
}

*Define labor ratios
foreach y in "01" "11" {
  gen pct_scst_`y' = pct_sc_`y' + pct_st_`y'
  gen pct_ag_al_`y' = work_pooled_al_p_`y'/work_pooled_ag_p_`y'
  gen pct_ag_main_`y' = work_main_ag_p_`y'/work_pooled_ag_p_`y'
  gen pct_work_ag_`y' = work_pooled_ag_p_`y'/work_p_`y'
  gen pct_work_main_`y' = work_main_p_`y'/work_p_`y'
}

*Define population variables
foreach p in p m f {
  gen pop_growth_`p'_11 = (tot_`p'11-tot_`p')/tot_`p'
}
gen pop_non_zero = tot_m!=0 & tot_f!=0 & tot_m11!=0 & tot_f11!=0 & tot_m!=. & tot_f!=. & tot_m11!=. & tot_f11!=.
gen ln_area_01 = ln(vd_geo_a_01)

*Pool HPCA cooking and lighting variables
foreach y in "01" "11" {
  gen hpca`y'_h_size_avg = hpca`y'_h_size_1 + hpca`y'_h_size_2*2 + hpca`y'_h_size_3*3 + hpca`y'_h_size_4*4 + hpca`y'_h_size_5*5 + hpca`y'_h_size_6_8*7 + hpca`y'_h_size_9*9
  egen hpca`y'_cook_good = rowtotal(hpca`y'_cook_elec hpca`y'_cook_lpg_png hpca`y'_cook_biog)
  gen hpca`y'_cook_bad = 1 - hpca`y'_cook_good - hpca`y'_cook_no
	egen hpca`y'_msl_good = rowtotal(hpca`y'_msl_sol hpca`y'_msl_elec)
	gen hpca`y'_msl_bad = 1 - hpca`y'_msl_good -  hpca`y'_msl_nl
}

*Define geographic variables for heterogeneity analysis
replace vdpr_tra_d_app_pr_01 = 0 if vdpr_tra_d_app_pr_01==. & vdpr_tra_d_app_mr_01==1
foreach y in "01" "11" {
  gen pct_irr_`y' = min(vd_geo_a_irr_`y'/vd_geo_a_`y',1)
  replace pct_irr_`y' = . if vd_geo_a_`y'==. | vd_geo_a_`y'==0
  gen pct_irr_twell_`y' = min(vd_geo_a_irr_w_twell_`y'/vd_geo_a_`y',1)
  replace pct_irr_twell_`y' = . if vd_geo_a_`y'==. | vd_geo_a_`y'==0
  gen pct_irr_irr_twell_`y' = min(vd_geo_a_irr_w_twell_`y'/vd_geo_a_irr_`y',1)
  replace pct_irr_irr_twell_`y' = . if vd_geo_a_irr_`y'==. | vd_geo_a_irr_`y'==0
}
gen delta_twell = pct_irr_irr_twell_11-pct_irr_irr_twell_01
egen vdp_wat_d_any_sum_01 = rowmax(vd_wat_d_*_sum_01)
gen pct_sown_11 = min(vdp_geo_a_sown_11/vd_geo_a_11,1)
replace vd_com_d_post_off_01 = 1 if vd_com_d_post_tele_01==1
replace vd_com_d_post_off_11 = 1 if vd_com_d_post_tele_11==1

}

********************************************************************************
********************************************************************************

** Step 3: Construct RD Variables and Define Reduced-Form Sample 
{

*Define who's in the sample, by merge outcome
gen sample = merge_NL==3
replace sample = 0 if tot_p==.
replace sample = 0 if vdpr4==. | vplan4==. 

gen sample_h = sample
replace sample_h = 0 if match_h3==0 & match_h9==0
replace sample_h = 1 if match_h3==1 | match_h9==1
replace sample_h = 0 if vdpr4==. | vplan4==. 
assert (totp_h3!=. & maxp_h3!=.) | (totp_h9!=. & maxp_h9!=.) & sing_h!=. if sample_h==1

*Flag untreated districts to remove from sample
gen treat_cov = v_id_master!=.   // dropping missing districts from RGGVY microdata is equivalent to doing so using the RGGVY district-level aggregates
egen max_treat = max(treat_cov), by(st_code dt_code)
replace sample = 0 if max_treat ==.
replace sample_h = 0 if max_treat ==.

*Create RD variables for regressions
gen p_300_X   = tot_p-300
gen p_300_D   = p_300_X>=0
gen p_300_DX  = p_300_X*p_300_D
gen band300_p_50 = abs(tot_p-300)<=50
gen band300_p_100 = abs(tot_p-300)<=100
gen band300_p_150 = abs(tot_p-300)<=150
gen band300_p_200 = abs(tot_p-300)<=200
egen stdt = group(st_code dt_code)

*Create bins for RD pictures, of varying sizes (making sure 300 is to the bin to the right of the threshold)
foreach b in 25 20 15 10 5 3 {
  egen tmp`b'_lo = cut(tot_p), at(0(`b')300)
  egen tmp`b'_hi = cut(tot_p), at(301(`b')1200)
	*replace tmp`b'_hi = 301 if tot_p==300
  egen tmp`b'_lo_min = min(tot_p) if tot_p<300, by(tmp`b'_lo)
	egen tmp`b'_lo_max = max(tot_p) if tot_p<300, by(tmp`b'_lo)
	egen tmp`b'_hi_min = min(tot_p) if tot_p>300, by(tmp`b'_hi)
	egen tmp`b'_hi_max = max(tot_p) if tot_p>300, by(tmp`b'_hi)
	gen bin`b'_p = (tmp`b'_lo_max + tmp`b'_lo_min)/2 if tot_p<300
	replace bin`b'_p = (tmp`b'_hi_max + tmp`b'_hi_min)/2 if tot_p>300
	qui sum bin`b'_p if tot_p==301
	replace bin`b'_p = r(mean) if tot_p==300
}	
drop tmp*	

*Verify merge using GIS attribute tables, and remove the VERY few discrepancies from sample
count if merge_NL==3
count if merge_NL==3 & tot_p==tot_pop
replace sample=0 if tot_p!=tot_pop & tot_pop>0 & tot_p>0 & tot_pop!=.
drop tot_pop treat_cov merge_NL

*Define Reduced Form sample
gen rf_sample_50       = band300_p_50==1  & vplan4<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_100      = band300_p_100==1 & vplan4<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_150      = band300_p_150==1 & vplan4<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_200      = band300_p_200==1 & vplan4<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_50_vp3   = band300_p_50==1  & vplan3<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_100_vp3  = band300_p_100==1 & vplan3<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_150_vp3  = band300_p_150==1 & vplan3<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_50_vp2   = band300_p_50==1  & vplan2<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_100_vp2  = band300_p_100==1 & vplan2<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_150_vp2  = band300_p_150==1 & vplan2<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_50_vp1   = band300_p_50==1  & vplan<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_100_vp1  = band300_p_100==1 & vplan<11 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen rf_sample_150_vp1  = band300_p_150==1 & vplan<11 & sample_h==1 & sing_h==1 & pop_non_zero==1

*Define power quality variables
foreach pwr in "all" "com" "dom" "agr" {
  gen vdp_pwr_h_`pwr'_avg_11 = (vdp_pwr_h_`pwr'_sum_11 + vdp_pwr_h_`pwr'_win_11)/2
}
foreach pwr in "all" {
	cap egen temp_`pwr' = mean(vdp_pwr_h_`pwr'_avg_11) if tot_p<1000 & tot_p>150 & vd_pwr_d_`pwr'_11==1 & rf_sample_150==0, by(st_code dt_code bk_code)
	cap egen temp_`pwr' = mean(vdp_pwr_h_`pwr'_avg_11) if tot_p<1000 & tot_p>150 & vdp_pwr_d_`pwr'_11==1 & rf_sample_150==0, by(st_code dt_code bk_code)
	egen bkavg_pwr_h_`pwr'_11 = mean(temp_`pwr'), by(st_code dt_code bk_code)
}
cap drop temp*

}

********************************************************************************
********************************************************************************

** Step 4: Define Sample States (based on lights correlations)
{
gen area_corr_01_st = .
egen vdpr4_group = group(vdpr4)
forvalues st = 1/33 {
  cap correlate area_geo vd_geo_a_01 if sample==1 & vd_geo_a_01!=0 & st_code==`st'
  cap replace area_corr_01_st = r(rho) if sample==1 & st_code==`st'
}
gen corr_state = (area_corr_01_st>0.35 & area_corr_01_st<1)
unique st_code if rf_sample_150==1 & pop_non_zero==1 & corr_state==1 & sample==1

*Define First Stage sample
cap drop fs_sample_*
gen fs_sample_50       = band300_p_50==1  & vplan4<11 & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_100      = band300_p_100==1 & vplan4<11 & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_150      = band300_p_150==1 & vplan4<11 & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_200      = band300_p_200==1 & vplan4<11 & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_50_vp3   = band300_p_50==1  & vplan3<11 & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_100_vp3  = band300_p_100==1 & vplan3<11 & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_150_vp3  = band300_p_150==1 & vplan3<11 & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_50_vp2   = band300_p_50==1  & vplan2<11 & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_100_vp2  = band300_p_100==1 & vplan2<11 & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_150_vp2  = band300_p_150==1 & vplan2<11 & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_50_vp1   = band300_p_50==1  & vplan<11  & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_100_vp1  = band300_p_100==1 & vplan<11  & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1
gen fs_sample_150_vp1  = band300_p_150==1 & vplan<11  & corr_state==1 & sample==1 & sample_h==1 & sing_h==1 & pop_non_zero==1

}

********************************************************************************
********************************************************************************

** Step 5: Calculate averge outcomes of adjacent villages (takes a long time, can skip this step!)
if 1==1{ // <-- turn to 1==0 to skip this step

preserve
keep pca01_id fs_sample_150 lights_max2001_hat lights_max2011_hat diff_max_proj_01_11 ///
     work_pooled_* centroid_y centroid_x
compress
save "$sens/panel_for_ngh_outcomes.dta", replace
restore

do "$path_code/merge/merge_spatial_ngh_outcomes.do" // LONG RUNTIME

merge 1:1 pca01_id using "$sens/panel_ngh_outcomes.dta", nogen keepusing(*ngh*)

}

********************************************************************************
********************************************************************************

** Step 6: Merge in distance to 10th/11th Plan boundaries and land gradient
{

merge m:1 st_code dt_code vi_code using "$path/data/village gis maps/villages_with_distances_all.dta", ///
	nogen keep(1 3) keepusing(distance_to_opp_plan)
replace distance_to_opp = 0 if vdpr4>10 & vdpr4<11
foreach k in 1 5 10 20 50 100 {
	gen within_border_`k'k = distance_to_opp<=`k'
}
merge m:1 st_code dt_code vi_code using "$path/data/village gis maps/elev_stata_all.dta", ///
	nogen keep(1 3) keepusing(grad_max grad_mean)
 
}

********************************************************************************
********************************************************************************

** Step 7: Label, Order, Compress, and Save
{

*Label some variables
{
cap la var c_code01 "Village identifier from ML Infomaps"
cap la var state "State"
cap la var st_code "2001 state code"
cap la var dt_code "2001 district code"
cap la var bk_code_NL "2001 block code (from night lights)"
cap la var vi_code "2001 village code"
cap la var vd11_id "2011 VD id"
cap la var maxlights_mean "Village's brightest mean brightness value"
cap la var maxlights_max "Village's brightest max brightness value"
cap la var yearmax_mean "Year of village's brightest mean brightness"
cap la var yearmax_max "Year of village's brightest max brightness"
cap la var minlights_mean "Village's darkest mean brightness value"
cap la var minlights_max "Village's darkest max brightness value"
cap la var yearmin_mean "Year of village's darkest mean brightness"
cap la var yearmin_max "Year of village's darkest max brightness"
cap la var area_geo "Village GIS area (sq km)"
cap la var centroid_x "Longitude of village centroid"
cap la var centroid_y "Latitude of village centroid"
cap la var pop_mismatch20 "Indicator for sum(hab pop) not within 20% of village pop"
cap la var delta_twell "Indicator for 2011-2001 change in tubewell status"
cap la var bkavg_pwr_h_all_11 "Blockwise avg hour of power for all end uses (2011)"
cap la var vdp_pwr_h_all_avg_11 "Avg hour of power for all end uses (2011)"
cap la var vdp_pwr_h_dom_avg_11 "Avg hour of power for domestic end use (2011)"
cap la var vdp_pwr_h_agr_avg_11 "Avg hour of power for ag use (2011)"
cap la var vdp_pwr_h_com_avg_11 "Avg hour of power for commercial use (2011)"
cap la var vdp_wat_d_any_sum_01 "Dummy for summer water (any source, 2001)"
cap la var grad_max "Maximum land gradient (degrees)"
cap la var grad_mean "Mean land gradient (degrees)"



foreach m in max mean {
	cap la var post_avg_`m'_11 "Avg `m' brightness (2011)"
	cap la var post_avg_`m'_10_11 "Avg `m' brightness (2010-2011)"
	cap la var post_avg_`m'_11_12 "Avg `m' brightness (2011-2012)"
	cap la var post_avg_`m'_10_12 "Avg `m' brightness (2010-2012)"
	cap la var post_avg_`m'_09_12 "Avg `m' brightness (2009-2012)"
	cap la var post_avg_`m'_10_13 "Avg `m' brightness (2010-2013)"
	cap la var post_avg_`m'_09_13 "Avg `m' brightness (2009-2013)"
	cap la var pre_avg_`m'_01 "Avg `m' brightness (2001)"
	cap la var pre_avg_`m'_00_01 "Avg `m' brightness (2000-2001)"
	cap la var pre_avg_`m'_01_02 "Avg `m' brightness (2001-2002)"
	cap la var pre_avg_`m'_00_02 "Avg `m' brightness (2000-2002)"
	cap la var pre_avg_`m'_99_02 "Avg `m' brightness (1999-2002)"
	cap la var pre_avg_`m'_00_03 "Avg `m' brightness (2000-2003)"
	cap la var pre_avg_`m'_99_03 "Avg `m' brightness (1999-2003)"
	cap la var diff_avg_`m'_11 "Diff avg `m' brightness (2011) - (2001)"
	cap la var diff_avg_`m'_10_11 "Diff avg `m' brightness (2010-2011) - (2000-2001)"
	cap la var diff_avg_`m'_11_12 "Diff avg `m' brightness (2011-2012) - (2001-2002)"
	cap la var diff_avg_`m'_10_12 "Diff avg `m' brightness (2010-2012) - (2000-2002)"
	cap la var diff_avg_`m'_09_12 "Diff avg `m' brightness (2009-2012) - (1999-2002)"
	cap la var diff_avg_`m'_10_13 "Diff avg `m' brightness (2010-2013) - (2000-2003)"
	cap la var diff_avg_`m'_09_13 "Diff avg `m' brightness (2009-2013) - (1999-2003)"
	cap la var ln_diff_avg_`m'_11 "Log diff avg `m' brightness (2011) - (2001)"
	cap la var ln_diff_avg_`m'_10_12 "Log diff avg `m' brightness (2010-2012) - (2000-2002)"
	cap la var ln_diff_avg_`m'_09_13 "Log diff avg `m' brightness (2009-2013) - (1999-2003)"
	cap la var lights_`m'2012_hat "Linear projection of 2012 `m' brightness"
	cap la var lights_`m'2011_hat "Linear projection of 2011 `m' brightness"
	cap la var lights_`m'2010_hat "Linear projection of 2010 `m' brightness"
	cap la var lights_`m'2009_hat "Linear projection of 2009 `m' brightness"
	cap la var lights_`m'2008_hat "Linear projection of 2008 `m' brightness"
	cap la var lights_`m'2007_hat "Linear projection of 2007 `m' brightness"
	cap la var lights_`m'2006_hat "Linear projection of 2006 `m' brightness"
	cap la var lights_`m'2005_hat "Linear projection of 2005 `m' brightness"
	cap la var lights_`m'2004_hat "Linear projection of 2004 `m' brightness"
	cap la var lights_`m'2003_hat "Linear projection of 2003 `m' brightness"
	cap la var lights_`m'2002_hat "Linear projection of 2002 `m' brightness"
	cap la var lights_`m'2001_hat "Linear projection of 2001 `m' brightness"
	cap la var lights_`m'2000_hat "Linear projection of 2000 `m' brightness"
	cap la var lights_`m'1999_hat "Linear projection of 1999 `m' brightness"
	cap la var lights_`m'1998_hat "Linear projection of 1998 `m' brightness"
	cap la var lights_`m'2011_1_hat "Linear projection of 2011 `m' brightness (adj yrs only)"
	cap la var lights_`m'2001_1_hat "Linear projection of 2001 `m' brightness (adj yrs only)"
	cap la var lights_sta_`m'2011_hat "Linear projection of 2011 `m' brightness (stable)"
	cap la var lights_sta_`m'2001_hat "Linear projection of 2001 `m' brightness (stable)"
	cap la var lights_pct_`m'2011_hat "Linear projection of 2011 `m' brightness (xpct)"
	cap la var lights_pct_`m'2001_hat "Linear projection of 2001 `m' brightness (xpct)"
	cap la var diff_`m'_proj_01_11 "Difference in linear projections, 2011 `m' - 2001 `m'"
	cap la var ln_diff_`m'_proj_01_11 "Log difference in linear projections, 2011 `m' - 2001 `m'"
	forvalues y = 1998/2013 {
		cap la var lights_`m'`y' "`y' `m' village brightness"
		cap la var lights_sta_`m'`y' "`y' `m' village brightness (STABLE)"
		cap la var lights_pct_`m'`y' "`y' `m' village brightness (XPCT)"
	}
	foreach k in 5 10 20 50 {
		cap la var lights_`m'2001_hat_ngh`k' "Avg 2001 `m' brightness of neighbors within `k'km radius"
		cap la var lights_`m'2011_hat_ngh`k' "Avg 2011 `m' brightness of neighbors within `k'km radius"
		cap la var diff_`m'_proj_01_11_ngh`k' "Avg 2011-2001 diff in `m' brightness of neighbors within `k'km radius"
	}
}	

foreach y in "01" "11" {
cap la var pop_growth_p_`y' "pop growth, 2001-2011"
cap la var pop_growth_m_`y' "male pop growth, 2001-2011"
cap la var pop_growth_f_`y' "female pop growth, 2001-2011"
cap la var work_main_ag_p_`y' "% pop ag workers"
cap la var work_marg_ag_p_`y' "% pop ag workers"
cap la var work_main_ag_m_`y' "% male pop ag workers"
cap la var work_marg_ag_m_`y' "% male pop ag workers"
cap la var work_main_ag_f_`y' "% female pop ag workers"
cap la var work_marg_ag_f_`y' "% female pop ag workers"
cap la var work_pooled_cl_p_`y' "% pop cultivators"
cap la var work_pooled_al_p_`y' "% pop agri-laborers"
cap la var work_pooled_hh_p_`y' "% pop household industry workers"
cap la var work_pooled_ot_p_`y' "% pop other workers"
cap la var work_pooled_ag_p_`y' "% pop ag workers"
cap la var work_pooled_cl_m_`y' "% male pop cultivators"
cap la var work_pooled_al_m_`y' "% male pop agri-laborers"
cap la var work_pooled_hh_m_`y' "% male pop household industry workers"
cap la var work_pooled_ot_m_`y' "% male pop other workers"
cap la var work_pooled_ag_m_`y' "% male pop ag workers"
cap la var work_pooled_cl_f_`y' "% female pop cultivators"
cap la var work_pooled_al_f_`y' "% female pop agri-laborers"
cap la var work_pooled_hh_f_`y' "% female pop household industry workers"
cap la var work_pooled_ot_f_`y' "% female pop other workers"
cap la var work_pooled_ag_f_`y' "% female pop ag workers"
cap la var hpca`y'_cook_good "% HH with electric or gas cooking"
cap la var hpca`y'_cook_bad "% HH with non-electric, non-gas cooking"
cap la var hpca`y'_msl_good "% HH with electric or solar lighting"
cap la var hpca`y'_msl_bad "% HH with non-electric, non-solar lighting"
cap la var hpca`y'_h_size_avg "Avg size of household"
cap la var pct_scst_`y' "% pop SC or ST"
cap la var pct_ag_al_`y' "% of ag workers that are agri-laborers"
cap la var pct_ag_main_`y' "% of ag workers that are main ag workers"
cap la var pct_work_ag_`y' "% of workers that are ag workers"
cap la var pct_work_main_`y' "% of workers that are main workers"
cap la var pct_irr_`y' "% of area irrigated"
cap la var pct_irr_twell_`y' "% of area irrigated by tubewell"
cap la var pct_irr_irr_twell_`y' "% of irrigated area irrigated by tubewell"
cap la var pct_sown_`y' "% of land area planted"
foreach k in 5 10 20 50 {
  cap la var work_pooled_ag_p_`y'_ngh`k' "Avg % pop ag workers of neighbors within `k'km radius"
  cap la var work_pooled_ag_m_`y'_ngh`k' "Avg % male pop ag workers of neighbors within `k'km radius"
  cap la var work_pooled_ag_f_`y'_ngh`k' "Avg % female pop ag workers of neighbors within `k'km radius"
  cap la var work_pooled_ot_p_`y'_ngh`k' "Avg % pop other workers of neighbors within `k'km radius"
  cap la var work_pooled_ot_m_`y'_ngh`k' "Avg % male pop other workers of neighbors within `k'km radius"
  cap la var work_pooled_ot_f_`y'_ngh`k' "Avg % female pop other workers of neighbors within `k'km radius"
}
}


cap la var sample "Indicator for villages in nightlights sample"
cap la var sample_h "Indicator for villages with habitation census matches"
cap la var max_treat "Indicator for RGGVY treatment at the district-level"
cap la var p_300_X "RD running variable (population), centered at 300"
cap la var p_300_D "RD insturment (population>300)"
cap la var p_300_DX "RD instrument interacted with running variable"
cap la var band300_p_50 "Indicator for having population between (250,350)"
cap la var band300_p_100 "Indicator for having population between (300,400)"
cap la var band300_p_150 "Indicator for having population between (150,450)"
cap la var band300_p_200 "Indicator for having population between (100,500)"
cap la var stdt "State-district group"
cap la var bin10_p "10-person population bins (for RD graphs)"
cap la var bin5_p "5-person population bins (for RD graphs)"
cap la var bin15_p "15-person population bins (for RD graphs)"
cap la var bin3_p "3-person population bins (for RD graphs)"
cap la var bin20_p "20-person population bins (for RD graphs)"
cap la var bin25_p "25-person population bins (for RD graphs)"
cap la var corr_state "States with correlated shapefile areas"
*cap la var corr_state2 "States with correlated shapefile areas (alt: 2001 or 2011 corr)"
cap la var pop_non_zero "Villages with non-zero, non-missing populations"
cap la var ln_area_01 "Log of village area (2001, hectares)"
cap la var area_corr_01_st "Shapefile-area correlation by state (2001)"
*cap la var area_corr_11_st "Shapefile-area correlation by state (2011)"
cap la var fs_sample_50 "Villages in RD FS sample, at 50-person bandwidth"
cap la var fs_sample_100 "Villages in RD FS sample, at 100-person bandwidth" 
cap la var fs_sample_150 "Villages in RD FS sample, at 150-person bandwidth"
cap la var fs_sample_200 "Villages in RD FS sample, at 200-person bandwidth"
cap la var fs_sample_50_vp3 "Villages in RD FS sample, at 50-person bw (plan from RGGVY microdata)"
cap la var fs_sample_100_vp3 "Villages in RD FS sample, at 100-person bw (plan from RGGVY microdata)"
cap la var fs_sample_150_vp3 "Villages in RD FS sample, at 150-person bw (plan from RGGVY microdata)"
cap la var fs_sample_50_vp2 "Villages in RD FS sample, at 50-person bw (plan from RGGVY microdata)"
cap la var fs_sample_100_vp2 "Villages in RD FS sample, at 100-person bw (plan from RGGVY microdata)"
cap la var fs_sample_150_vp2 "Villages in RD FS sample, at 150-person bw (plan from RGGVY microdata)"
cap la var fs_sample_50_vp1 "Villages in RD FS sample, at 50-person bw (plan from RGGVY microdata)"
cap la var fs_sample_100_vp1 "Villages in RD FS sample, at 100-person bw (plan from RGGVY microdata)"
cap la var fs_sample_150_vp1 "Villages in RD FS sample, at 150-person bw (plan from RGGVY microdata)"
cap la var rf_sample_50 "Villages in RD RF sample, at 50-person bandwidth"
cap la var rf_sample_100 "Villages in RD RF sample, at 100-person bandwidth" 
cap la var rf_sample_150 "Villages in RD RF sample, at 150-person bandwidth"
cap la var rf_sample_200 "Villages in RD RF sample, at 200-person bandwidth"
cap la var rf_sample_50_vp3 "Villages in RD RF sample, at 50-person bw (plan from RGGVY microdata)"
cap la var rf_sample_100_vp3 "Villages in RD RF sample, at 100-person bw (plan from RGGVY microdata)"
cap la var rf_sample_150_vp3 "Villages in RD RF sample, at 150-person bw (plan from RGGVY microdata)"
cap la var rf_sample_50_vp2 "Villages in RD RF sample, at 50-person bw (plan from RGGVY microdata)"
cap la var rf_sample_100_vp2 "Villages in RD RF sample, at 100-person bw (plan from RGGVY microdata)"
cap la var rf_sample_150_vp2 "Villages in RD RF sample, at 150-person bw (plan from RGGVY microdata)"
cap la var rf_sample_50_vp1 "Villages in RD RF sample, at 50-person bw (plan from RGGVY microdata)"
cap la var rf_sample_100_vp1 "Villages in RD RF sample, at 100-person bw (plan from RGGVY microdata)"
cap la var rf_sample_150_vp1 "Villages in RD RF sample, at 150-person bw (plan from RGGVY microdata)"
cap la var distance_to_opp "Distance to opposite plan district (km)"
foreach k in 1 5 10 20 50 100 {
	cap la var within_border_`k'k "Indicator for villages within `k'km of opposite plan district"
}
}

cap drop beta* count_pos count_neg
rename centroid_x centroid_lon
rename centroid_y centroid_lat
order pca01_id-vd11_id c_code01 v_id_master names_id h3v_id h3_id h9v_id h9_id state st_code district dt_code ///
     block bk_code village vi_code area_geo centroid* no_hh tot_p tot_m tot_f
sort pca01_id
duplicates r pca01_id     

compress

preserve 
drop no_hh11-vdp_pwr_h_all_win_11
save "$panel/panel_dataset_no_outcomes.dta", replace
restore

preserve 
drop lights_sta_max1998-lights_mean2013 maxlights_mean-yearmin_max post_avg_mean_11-lights_pct_max2001_hat
save "$panel/panel_dataset_no_lights.dta", replace
restore

preserve 
gen temp1  = band300_p_200==1 & vplan4<11 & sample_h==1 & sing_h==1
egen temp2 = rowmax(fs_sample_* temp1)
drop if temp2==0
drop temp*
save "$panel/panel_dataset_rd_sample.dta", replace
restore

save "$panel/panel_dataset_full.dta", replace

} 

********************************************************************************
********************************************************************************

** Step 8: Reshape into 2001/2011 village panel, for diff-in-diff analysis
{
use "$panel/panel_dataset_full.dta", clear

*Drop unnecessary variables and standardize 01 & 11 tags
drop lights_sta_m* lights_pct_m* lights_max???? lights_mean???? maxlights_m* yearmax_m* minlights_m* yearmin_m* ///
     post_avg_m* pre_avg_m* diff_avg_m* ln_diff_avg_m* diff_m*_proj* ln_diff_m*_proj* lights_max2010_hat-lights_max2001_1_hat ///
	 ln_area_01 delta_twell *ngh* vdpr4_group dup_pca dup_pca01 pop_growth_?_11
foreach v of varlist hpca01_* {
	local name = subinstr("`v'","01","",1)
	rename `v' `name'_01
}
foreach v of varlist hpca11_* {
	local name = subinstr("`v'","11","",1)
	rename `v' `name'_11
}	 
foreach v of varlist lights_m*2001_hat* {
	local name = subinstr("`v'","2001","",1)
	rename `v' `name'_01
}
foreach v of varlist lights_m*2011_hat* {
	local name = subinstr("`v'","2011","",1)
	rename `v' `name'_11
}
la var lights_max_hat_01 "Linear projection of max brightness"
la var lights_mean_hat_01 "Linear projection of mean brightness"
la var lights_max_hat_11 "Linear projection of max brightness"
la var lights_mean_hat_11 "Linear projection of mean brightness"

gen tot_p01 = tot_p
la var tot_p01 "2001 village population (running variable)"
foreach v of varlist no_hh tot_p tot_m tot_f {
	rename `v' `v'_01
}
foreach v of varlist no_hh11 tot_p11 tot_m11 tot_f11 {
	local name = subinstr("`v'","11","",1)
	rename `v' `name'_11
}

*Merge in 1991 PCA district-level variables
merge m:1 st_code dt_code using "$pca/pca_census91.dta", nogen keepusing(*_91)
drop area_91 no_hh_res_91

*Create 1991-only version of dataset
preserve
drop *_01 *_11		// drop 2001 & 2011 variables
rename *_91 *   	// rename 1991 variables with non-year-specific name
gen year = 1991		// define year
tempfile year1991
save "`year1991'"
restore

*Create 2001-only version of dataset
preserve
drop *_91 *_11 		// drop 1991 & 2011 variables
rename *_01 *   	// rename 2001 variables with non-year-specific name
gen year = 2001		// define year
tempfile year2001
save "`year2001'"
restore

*Create 2011-only version of dataset
preserve
drop *_91 *_01		// drop 1991 & 2001 varialbes
rename *_11 *   	// rename 2011 variables with non-year-specific name
gen year = 2011		// define year
tempfile year2011
save "`year2011'"
restore

*Combine the two datasets into a three-year panel
use "`year2001'", clear
append using "`year2011'"
append using "`year1991'"

*Drop variables that are always missing for a single year
foreach year of numlist 2001 2011 {
	foreach var of varlist * {
		cap assert mi(`var') if year == `year'
		if _rc == 0 {
			drop `var'
			di "DROPPING `var'"
		}
	}
}

*Create difference-in-difference treatment dummy
gen treat_x_post = 0
replace treat_x_post = 1 if vplan4 < 11 & year == 2011

*Indicator for being a 10th or 11th plan village (not a 10.5!)
gen sample_1011 = 0
replace sample_1011 = 1 if vplan4==10 | vplan4==11

*Define panel variables
tsset pca01_id year

*2001 population bins
local bin = 300
forvalues i = 0(`bin')2600 {
  local iplus1 = `i' + `bin'
  gen popbin_`i'_`iplus1' = tot_p01>=`i' & tot_p01<`iplus1'
  la var popbin_`i'_`iplus1' "2001 pop bin: `i'-`iplus1'"
}
gen popbin_2700_max = tot_p01>=2700 & tot_p01!=.
la var popbin_2700_max "2001 pop bin: 2700+"

*Interactions between popbins and DID treatment variable
forvalues i = 0(`bin')2600 {
  local iplus1 = `i' + `bin'
  gen dd_x_popbin_`i'_`iplus1' = treat_x_post * popbin_`i'_`iplus1'
  la var dd_x_popbin_`i'_`iplus1' "DD treat x popbin: `i'-`iplus1'"
}
gen dd_x_popbin_2700_max = treat_x_post * popbin_2700_max
la var dd_x_popbin_2700_max "DD treat x popbin 2700 to infinity"

*Interaction term between 2001 pop and treatment
gen dd_x_pop01 = treat_x_post * tot_p01

*Create state x time FE, just in case we want it 
egen stateyear = group(st_code year)

*Clean up and label
la var treat_x_post "Diff-in-diff indicator (10th-Plan, 2011)"
la var sample_1011 "Villages in RGGVY single-plan districts"
la var dd_x_pop01 "Diff-in-diff indicator x 2001 population"
la var year "Year"
la var stateyear "State-year group"
unique pca01_id year
assert r(unique)==r(N) 
sort pca01_id year
order pca01_id year

*save full three-year panel
preserve
foreach v of varlist no_hh tot_p tot_m tot_f {
	replace `v' = . if year==1991
}
compress
save "$panel/panel_dataset_dd_91.dta", replace
restore

*save as two-year panel, without 1991 observations
preserve
drop if year==1991
compress 
save "$panel/panel_dataset_dd.dta", replace
restore

*save as collapsed district-level three-year panel 
	// grab labels to assign post-collapse
foreach v of var * {
	local l`v' : variable label `v'
  if `"`l`v''"' == "" {
		local l`v' "`v'"
	}
}

	// weight-average percentage variables by population
egen tot_p_dist = sum(tot_p), by(st_code dt_code year)
foreach v of varlist lights_mean_hat-hpca_msl_nl vd_???_d_* work_main_ag_p-pct_work_main hpca_h_size_avg-pct_irr_irr_twell sing_h sample sample_h distance_to_opp_plan{
  gen temp_`v' = `v'*tot_p/tot_p_dist
	egen dist_`v' = sum(temp_`v'), by(st_code dt_code year)
	replace dist_`v' = `v' if year==1991
	drop temp_`v' `v'
}

	// collapse to the district level
collapse (mean) dist_* area_corr_01_st corr_state (count) pca01_id (sum) tot_p01 dd_x_pop01, by(year st_code dt_code state district vplan4 vdpr4 max_treat stdt treat_x_post sample_1011 stateyear) fast
foreach v of varlist dist_* {
  local v2 = substr("`v'",6,100)
	rename `v' `v2'
	la var `v2' "`l`v2'' (DISTRICT)"
}
la var tot_p01 "`ltot_p01'"
la var dd_x_pop01 "`ltot_p01'"
la var pca01_id "Number of 2001 villages"
sort st_code dt_code year
order year st_code state dt_code district vplan-vdpr4 max_treat stdt area_corr_01_st corr_state treat_x_post sample_1011 stateyear tot_p01 pca01_id
compress 
save "$panel/panel_dataset_dd_91dt.dta", replace

}

********************************************************************************
********************************************************************************
